Hierarchical Clustering¶

  1. Read the dataset
  2. Data investigation
  3. Data preprocessing
  4. Features transformation
  5. K means
  6. Training datasets
  7. Improvement ideas
In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

1. Read the dataset¶

(Go to top)

First dowmload the data set from this link https://www.kaggle.com/code/sadkoktaybicici/credit-card-data-clustering-k-mean/data then import it in python.

In [11]:
#read the data

data_path = '/content/CC GENERAL.csv'  #the path where you downloaded the data
df = pd.read_csv(data_path)

print('The shape of the dataset is:', df.shape)
The shape of the dataset is: (8950, 18)

2. Data investigation¶

(Go to top)

in this part you need to check the data quality and assess any issues in the data as:

  • null values in each column
  • each column has the proper data type
  • outliers
  • duplicate rows
  • distribution for each column (skewness)

comment each issue you find

In [12]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 18 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   CUST_ID                           8950 non-null   object 
 1   BALANCE                           8950 non-null   float64
 2   BALANCE_FREQUENCY                 8950 non-null   float64
 3   PURCHASES                         8950 non-null   float64
 4   ONEOFF_PURCHASES                  8950 non-null   float64
 5   INSTALLMENTS_PURCHASES            8950 non-null   float64
 6   CASH_ADVANCE                      8950 non-null   float64
 7   PURCHASES_FREQUENCY               8950 non-null   float64
 8   ONEOFF_PURCHASES_FREQUENCY        8950 non-null   float64
 9   PURCHASES_INSTALLMENTS_FREQUENCY  8950 non-null   float64
 10  CASH_ADVANCE_FREQUENCY            8950 non-null   float64
 11  CASH_ADVANCE_TRX                  8950 non-null   int64  
 12  PURCHASES_TRX                     8950 non-null   int64  
 13  CREDIT_LIMIT                      8949 non-null   float64
 14  PAYMENTS                          8950 non-null   float64
 15  MINIMUM_PAYMENTS                  8637 non-null   float64
 16  PRC_FULL_PAYMENT                  8950 non-null   float64
 17  TENURE                            8950 non-null   int64  
dtypes: float64(14), int64(3), object(1)
memory usage: 1.2+ MB
In [13]:
df.describe()
Out[13]:
BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
count 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8949.000000 8950.000000 8637.000000 8950.000000 8950.000000
mean 1564.474828 0.877271 1003.204834 592.437371 411.067645 978.871112 0.490351 0.202458 0.364437 0.135144 3.248827 14.709832 4494.449450 1733.143852 864.206542 0.153715 11.517318
std 2081.531879 0.236904 2136.634782 1659.887917 904.338115 2097.163877 0.401371 0.298336 0.397448 0.200121 6.824647 24.857649 3638.815725 2895.063757 2372.446607 0.292499 1.338331
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 50.000000 0.000000 0.019163 0.000000 6.000000
25% 128.281915 0.888889 39.635000 0.000000 0.000000 0.000000 0.083333 0.000000 0.000000 0.000000 0.000000 1.000000 1600.000000 383.276166 169.123707 0.000000 12.000000
50% 873.385231 1.000000 361.280000 38.000000 89.000000 0.000000 0.500000 0.083333 0.166667 0.000000 0.000000 7.000000 3000.000000 856.901546 312.343947 0.000000 12.000000
75% 2054.140036 1.000000 1110.130000 577.405000 468.637500 1113.821139 0.916667 0.300000 0.750000 0.222222 4.000000 17.000000 6500.000000 1901.134317 825.485459 0.142857 12.000000
max 19043.138560 1.000000 49039.570000 40761.250000 22500.000000 47137.211760 1.000000 1.000000 1.000000 1.500000 123.000000 358.000000 30000.000000 50721.483360 76406.207520 1.000000 12.000000
In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

column_of_interest = 'BALANCE'
selected_data = df[[column_of_interest]].dropna()

bar_color = 'lightgreen'
edge_color = 'black'
annotation_color = 'blue'

plt.figure(figsize=(6, 4))
sns.histplot(selected_data[column_of_interest], bins=5, kde=False, color=bar_color, edgecolor=edge_color)
plt.xlabel(f'{column_of_interest}')
plt.ylabel('Count')
plt.title(f'Histogram of {column_of_interest}')

counts, bins = np.histogram(selected_data[column_of_interest], bins=5)
percentages = (counts / len(selected_data[column_of_interest])) * 100

for count, bin_edge, percentage in zip(counts, bins[:-1], percentages):
    plt.annotate(f'{percentage:.3f}%', xy=(bin_edge + np.diff(bins)[0] / 2, count), xytext=(0, 1),
                 textcoords='offset points', ha='center', va='bottom', fontsize=8, color=annotation_color)

plt.tight_layout()
plt.show()

most customer have balance more than 2500 and less than 5000

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

column_of_interest = 'PURCHASES'
selected_data = df[[column_of_interest]].dropna()

bar_color = 'lightgreen'
edge_color = 'black'
annotation_color = 'blue'

plt.figure(figsize=(6, 4))
sns.histplot(selected_data[column_of_interest], bins=5, kde=False, color=bar_color, edgecolor=edge_color)
plt.xlabel(f'{column_of_interest}')
plt.ylabel('Count')
plt.title(f'Histogram of {column_of_interest}')

counts, bins = np.histogram(selected_data[column_of_interest], bins=5)
percentages = (counts / len(selected_data[column_of_interest])) * 100

for count, bin_edge, percentage in zip(counts, bins[:-1], percentages):
    plt.annotate(f'{percentage:.3f}%', xy=(bin_edge + np.diff(bins)[0] / 2, count), xytext=(0, 1),
                 textcoords='offset points', ha='center', va='bottom', fontsize=8, color=annotation_color)

plt.tight_layout()
plt.show()
In [16]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

column_of_interest = 'CASH_ADVANCE'
selected_data = df[[column_of_interest]].dropna()

bar_color = 'lightgreen'
edge_color = 'black'
annotation_color = 'blue'

plt.figure(figsize=(6, 4))
sns.histplot(selected_data[column_of_interest], bins=5, kde=False, color=bar_color, edgecolor=edge_color)
plt.xlabel(f'{column_of_interest}')
plt.ylabel('Count')
plt.title(f'Histogram of {column_of_interest}')

counts, bins = np.histogram(selected_data[column_of_interest], bins=5)
percentages = (counts / len(selected_data[column_of_interest])) * 100

for count, bin_edge, percentage in zip(counts, bins[:-1], percentages):
    plt.annotate(f'{percentage:.3f}%', xy=(bin_edge + np.diff(bins)[0] / 2, count), xytext=(0, 1),
                 textcoords='offset points', ha='center', va='bottom', fontsize=8, color=annotation_color)

plt.tight_layout()
plt.show()

most 98.227 do cash advance

In [17]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

column_of_interest = 'BALANCE_FREQUENCY'
selected_data = df[[column_of_interest]].dropna()

bar_color = 'lightgreen'
edge_color = 'black'
annotation_color = 'blue'

plt.figure(figsize=(6, 4))
sns.histplot(selected_data[column_of_interest], bins=5, kde=False, color=bar_color, edgecolor=edge_color)
plt.xlabel(f'{column_of_interest}')
plt.ylabel('Count')
plt.title(f'Histogram of {column_of_interest}')

counts, bins = np.histogram(selected_data[column_of_interest], bins=5)
percentages = (counts / len(selected_data[column_of_interest])) * 100

for count, bin_edge, percentage in zip(counts, bins[:-1], percentages):
    plt.annotate(f'{percentage:.3f}%', xy=(bin_edge + np.diff(bins)[0] / 2, count), xytext=(0, 1),
                 textcoords='offset points', ha='center', va='bottom', fontsize=8, color=annotation_color)

plt.tight_layout()
plt.show()
In [18]:
df.duplicated().sum()
Out[18]:
0
In [19]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

for col in list(df.columns)[1:]:
      plt.figure()
      rcParams['figure.figsize'] = (20, 3)
      fig, ax_box = plt.subplots()
      plt.gca().set(xlabel=col, ylabel='Density')
      sns.boxplot(x=col, linewidth=1.0, palette='Blues', data=df, ax=ax_box)
      plt.show()
<Figure size 640x480 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
In [20]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

for col in list(df.columns)[1:]:
      plt.figure()
      rcParams['figure.figsize'] = (20, 3)
      fig, ax_kde = plt.subplots()
      plt.gca().set(xlabel=col, ylabel='Density')
      sns.kdeplot(df[col], fill=True, palette='Blues', ax=ax_kde)
      plt.show()
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
In [21]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
sns.heatmap(df.corr())
plt.show()
In [22]:
plt.figure(figsize=(20,10))
coorlation = df.iloc[:,1:].corr()
thresh = 0.6
sns.heatmap(coorlation[abs(coorlation) > thresh] , annot=True)
Out[22]:
<Axes: >

3. Data preprocessing¶

(Go to top)

Define below all the issues that you had found in the previous part¶

1-
2-
3-

In [23]:
#make a copy for the original dataset
df_copy=df.copy()

for each issue adapt this methodology¶

  • start by defining the solution
  • apply this solution onn the data
  • test the solution to make sure that you have solved the issue
In [24]:
df_copy.drop('CUST_ID', axis=1, inplace=True)

First issue

In [25]:
#solution
df_copy.isnull().sum()
Out[25]:
BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          1
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
TENURE                                0
dtype: int64
In [26]:
df_copy['CREDIT_LIMIT'] = df_copy['CREDIT_LIMIT'].fillna(df_copy['CREDIT_LIMIT'].median())
In [27]:
#test
df_copy.isnull().sum()
Out[27]:
BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          0
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
TENURE                                0
dtype: int64

Second issue

In [28]:
#solution
df_copy.drop('ONEOFF_PURCHASES' ,  axis=1 , inplace=True)
df_copy.drop('MINIMUM_PAYMENTS' ,  axis=1 , inplace=True)
In [29]:
df_log=df_copy.copy()
In [30]:
df_log.head()
Out[30]:
BALANCE BALANCE_FREQUENCY PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS PRC_FULL_PAYMENT TENURE
0 40.900749 0.818182 95.40 95.4 0.000000 0.166667 0.000000 0.083333 0.000000 0 2 1000.0 201.802084 0.000000 12
1 3202.467416 0.909091 0.00 0.0 6442.945483 0.000000 0.000000 0.000000 0.250000 4 0 7000.0 4103.032597 0.222222 12
2 2495.148862 1.000000 773.17 0.0 0.000000 1.000000 1.000000 0.000000 0.000000 0 12 7500.0 622.066742 0.000000 12
3 1666.670542 0.636364 1499.00 0.0 205.788017 0.083333 0.083333 0.000000 0.083333 1 1 7500.0 0.000000 0.000000 12
4 817.714335 1.000000 16.00 0.0 0.000000 0.083333 0.083333 0.000000 0.000000 0 1 1200.0 678.334763 0.000000 12

4. Features transformation¶

(Go to top)

What is the feature scaling technique that would use and why?
return to this section again and try another technique and see how that will impact your result
for more details on different methods for scaling check these links

  • https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
  • https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
  • https://www.analyticsvidhya.com/blog/2020/07/types-of-feature-transformation-and-scaling/

Answer here:

In [31]:
from sklearn import preprocessing
In [32]:
df_copy1=df_copy
In [33]:
df_copy1.skew()
Out[33]:
BALANCE                             2.393386
BALANCE_FREQUENCY                  -2.023266
PURCHASES                           8.144269
INSTALLMENTS_PURCHASES              7.299120
CASH_ADVANCE                        5.166609
PURCHASES_FREQUENCY                 0.060164
ONEOFF_PURCHASES_FREQUENCY          1.535613
PURCHASES_INSTALLMENTS_FREQUENCY    0.509201
CASH_ADVANCE_FREQUENCY              1.828686
CASH_ADVANCE_TRX                    5.721298
PURCHASES_TRX                       4.630655
CREDIT_LIMIT                        1.522636
PAYMENTS                            5.907620
PRC_FULL_PAYMENT                    1.942820
TENURE                             -2.943017
dtype: float64

log transformation

In [34]:
col = ['BALANCE', 'PURCHASES',
       'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
       'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
       'CREDIT_LIMIT', 'PAYMENTS']
for c in col:
    df_copy1[c] = df_copy1[c] + 1e-4

    df_copy1[c] = np.log(df_copy1[c])
In [35]:
col = df_copy1.columns
for i in range(len(col)):
    plt.figure(figsize=(20,5))
    sns.histplot(data = df_copy1,x = col[i])
    plt.show()
In [41]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler , RobustScaler

function draw tsne

In [39]:
def draw_tsne(data , c=None):
    tsne = TSNE(n_components=2)
    tsne_data = tsne.fit_transform(data)

    plt.figure(figsize=(15,5))
    plt.scatter(tsne_data[:, 0], tsne_data[:, 1],c=c ,  cmap='viridis')
    plt.title('tsne Cluster Visualization')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.colorbar(label='Cluster')
    plt.show()
In [32]:
draw_tsne(df_copy1)

standerd scaler

In [27]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_copy1)

robust scaler

In [28]:
scaler = RobustScaler()
data_scaled2 = scaler.fit_transform(df_copy1)
In [29]:
draw_tsne(data_scaled)
In [30]:
draw_tsne(data_scaled2)
In [31]:
def pca_without_kernel(data):
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(data)

    plt.figure(figsize=(8, 6))
    plt.scatter(pca_data[:, 0], pca_data[:, 1])
    plt.title('PCA without Kernel')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

pca data without kernel

In [32]:
pca_without_kernel(df_copy1)

pca data with different 3 kernal

In [33]:
from sklearn.decomposition import KernelPCA
def pca_with_kernel_rbf(data):
    kpca = KernelPCA(n_components=2, kernel='rbf')
    kpca_data = kpca.fit_transform(data)

    plt.figure(figsize=(8, 6))
    plt.scatter(kpca_data[:, 0], kpca_data[:, 1])
    plt.title('PCA with Kernel (rbf)')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
In [34]:
pca_with_kernel_rbf(df_copy1)
In [35]:
from sklearn.decomposition import KernelPCA
def pca_with_kernel_poly(data):
    kpca = KernelPCA(n_components=2, kernel='poly')
    kpca_data = kpca.fit_transform(data)

    plt.figure(figsize=(8, 6))
    plt.scatter(kpca_data[:, 0], kpca_data[:, 1])
    plt.title('PCA with Kernel (poly)')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
In [36]:
pca_with_kernel_poly(df_copy1)
In [37]:
from sklearn.decomposition import KernelPCA
def pca_with_kernel_cosine(data):
    kpca = KernelPCA(n_components=2, kernel='cosine')
    kpca_data = kpca.fit_transform(data)

    plt.figure(figsize=(8, 6))
    plt.scatter(kpca_data[:, 0], kpca_data[:, 1])
    plt.title('PCA with Kernel (cosine)')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
    return kpca_data
In [38]:
data_pca=pca_with_kernel_cosine(df_copy1)

Best pca with cosine kernal

In [170]:
data_pca.shape
Out[170]:
(8950, 2)

5. K Means¶

(Go to top)

In [ ]:
 

1- Use the k means class that you implemented in the previous task to cluster this data 2- Use http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html and see if the difference in the result 3- Use elbow method to determine the K (plot the result using two plot one for distorion and another for inertia) 4- (Optionally) make a method that pick the best number of clusters for you 5- Using different techniques for scaling and comment on the result

In [171]:
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

def determine_k(data_scaled):
    wcss = []
    distortions=[]
    sil=[]
    for i in range(1,15):
        km = KMeans(n_clusters=i , random_state=42)
        km.fit(data_scaled)
        wcss.append(km.inertia_)
        distortions.append(sum(np.min(cdist(data_scaled, km.cluster_centers_, 'euclidean'), axis=1)**2) / data_scaled.shape[0])
        if i > 1:
            sil.append(silhouette_score(data_scaled, km.labels_, metric = 'euclidean'))

    return wcss , distortions , sil
In [172]:
wcss , distortions , sil = determine_k(df_copy1)

plt.figure(figsize=(10,5))
plt.plot(range(1,15),wcss, marker='o', linestyle='--')

plt.title('inertia vs epochs')
plt.xlabel('number of clusters')
plt.ylabel('inertia')
plt.show()


plt.figure(figsize=(11,5))
plt.plot(range(1,15),distortions, marker='o', linestyle='--')

plt.title('distortion vs epochs')
plt.xlabel('number of clusters')
plt.ylabel('distortion')
plt.show()

plt.figure(figsize=(11,5))
plt.plot(range(2,15),sil, marker='o', linestyle='--')

plt.title('silhouette_score vs epochs')
plt.xlabel('number of clusters')
plt.ylabel('silhouette_score')
plt.show()
In [176]:
wcss
Out[176]:
[2122353.025870178,
 997702.9910813475,
 493801.7906112459,
 301795.10287595977,
 180279.25477490854,
 151318.14017593587,
 133708.52134561515,
 117374.162711114,
 109376.12092573436,
 102381.19408545103,
 100205.00433687813,
 93159.57653597393,
 89347.42865137184,
 83188.24049753523]
In [174]:
!pip install kneed
Collecting kneed
  Downloading kneed-0.8.5-py3-none-any.whl (10 kB)
Requirement already satisfied: numpy>=1.14.2 in /usr/local/lib/python3.10/dist-packages (from kneed) (1.25.2)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from kneed) (1.11.4)
Installing collected packages: kneed
Successfully installed kneed-0.8.5
In [175]:
from kneed import KneeLocator
kl = KneeLocator(range(1, 15), wcss, curve="convex", direction="decreasing")
kl.elbow
Out[175]:
4
In [177]:
from kneed import KneeLocator
kl = KneeLocator(range(1, 15), distortions, curve="convex", direction="decreasing")
kl.elbow
Out[177]:
4

including the previous information i choose 5 cluster

In [28]:
from sklearn.cluster import KMeans
kmeans1 = KMeans(n_clusters=5 )
kmeans1.fit(df_copy1)
Out[28]:
KMeans(n_clusters=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=5)
In [29]:
dfnew = df_copy1.copy()
dfnew['cluster1'] = kmeans1.predict(df_copy1)
In [33]:
draw_tsne(df_copy1 , kmeans1.predict(df_copy1))
In [34]:
df_log.head()
Out[34]:
BALANCE BALANCE_FREQUENCY PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS PRC_FULL_PAYMENT TENURE
0 40.900749 0.818182 95.40 95.4 0.000000 0.166667 0.000000 0.083333 0.000000 0 2 1000.0 201.802084 0.000000 12
1 3202.467416 0.909091 0.00 0.0 6442.945483 0.000000 0.000000 0.000000 0.250000 4 0 7000.0 4103.032597 0.222222 12
2 2495.148862 1.000000 773.17 0.0 0.000000 1.000000 1.000000 0.000000 0.000000 0 12 7500.0 622.066742 0.000000 12
3 1666.670542 0.636364 1499.00 0.0 205.788017 0.083333 0.083333 0.000000 0.083333 1 1 7500.0 0.000000 0.000000 12
4 817.714335 1.000000 16.00 0.0 0.000000 0.083333 0.083333 0.000000 0.000000 0 1 1200.0 678.334763 0.000000 12

after I applied all algorithm in the notebook i found k means the best one

In [35]:
inves_df = df_log.copy()
inves_df['Cluster'] = kmeans1.labels_
In [42]:
feature_columns = ['BALANCE','BALANCE_FREQUENCY','PURCHASES' ,
                   'INSTALLMENTS_PURCHASES','CASH_ADVANCE','PURCHASES_FREQUENCY','ONEOFF_PURCHASES_FREQUENCY',
                   'PURCHASES_INSTALLMENTS_FREQUENCY','CASH_ADVANCE_FREQUENCY',
                   'CASH_ADVANCE_TRX','PURCHASES_TRX','CREDIT_LIMIT','PAYMENTS','PRC_FULL_PAYMENT','TENURE']

clusters = inves_df['Cluster'].unique()


num_rows = len(feature_columns)
num_cols = len(clusters)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 7*num_rows))

for i, col in enumerate(feature_columns):
    for j, cluster in enumerate(clusters):
        data = inves_df[inves_df['Cluster'] == cluster][col]
        # Plot histogram
        sns.histplot(data, bins=5, ax=axes[i, j], stat='percent')
        axes[i, j].set_title(f'Cluster {cluster}')
        axes[i, j].set_xlabel(col)
        axes[i, j].set_ylabel('Percent')

plt.tight_layout()
plt.show()
In [43]:
inves_df.groupby('Cluster')[feature_columns].mean()
Out[43]:
BALANCE BALANCE_FREQUENCY PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS PRC_FULL_PAYMENT TENURE
Cluster
0 762.631611 0.769844 881.062444 0.000000 0.000000 0.354988 0.354366 0.000311 0.000000 0.000000 7.725746 4424.077058 1199.907641 0.135739 11.557836
1 2340.086599 0.925295 659.884782 0.000000 2031.598844 0.275644 0.275540 0.000000 0.293253 6.843088 6.278954 4637.888600 1918.729797 0.057814 11.352428
2 2152.064324 0.882442 0.000000 0.042560 1989.715951 0.000000 0.000000 0.000122 0.272724 6.299559 0.001958 4029.009626 1653.844906 0.044346 11.318160
3 2749.022276 0.964807 1496.934539 697.550495 2076.241800 0.739190 0.272882 0.621509 0.282441 7.257453 24.798780 5353.666064 2682.909852 0.062290 11.523713
4 801.798310 0.859506 1488.982413 745.046102 0.000000 0.758072 0.227245 0.659107 0.000000 0.000000 22.981440 4393.620698 1503.322940 0.281572 11.654106

business meaning Customers in Cluster 0: they don't use CASH_ADVANCE_TRX ,cash_advance and installments_purchases and they have lowest balance

Customers in Cluster 1: highest balance,make moderate purchases, rarely use installment plans, but frequently rely on cash advances, they have near to zero PURCHASES_INSTALLMENTS_FREQUENCY

Customers in Cluster 2: Customers with a moderate balance with minimal purchasing activity but with frequent cash advances

Customers in Cluster 3: Customers have a relatively high balance and engage in significant purchasing activities, with a portion spent on installments, along with recurring cash advances

Customers in Cluster 4: Customers have moderate credit and frequent purchasing activities, especially one-time purchases and installments, with occasional cash advances.

6. improvement ideas¶

(Go to top)

In [204]:
data_pca1 = pd.DataFrame(data_pca)
In [207]:
data_pca1
Out[207]:
0 1
0 -0.485378 -0.165015
1 0.792745 -0.101441
2 -0.179049 -0.353550
3 0.330530 0.025923
4 -0.097485 -0.474160
... ... ...
8945 -0.616313 -0.125911
8946 -0.622973 -0.129284
8947 -0.613403 -0.177269
8948 0.857915 -0.391939
8949 0.335955 0.153192

8950 rows × 2 columns

k means with pca data

In [209]:
from sklearn.cluster import KMeans
kmeans2 = KMeans(n_clusters=5 )
kmeans2.fit(data_pca1)
Out[209]:
KMeans(n_clusters=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=5)
In [210]:
cluster_assignments = kmeans2.predict(data_pca1)
In [211]:
draw_tsne(data_pca1 , kmeans2.predict(data_pca1))

DBSCAN on log transform data

In [36]:
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np

k = 5
nbrs = NearestNeighbors(n_neighbors=k).fit(df_copy1)
distances, _ = nbrs.kneighbors(df_copy1)
k_distances = distances[:, -1]

k_distances_sorted = np.sort(k_distances)[::-1]

plt.figure(figsize=(10,10))
plt.plot(np.arange(len(k_distances_sorted[::-1])), k_distances_sorted[::-1])
plt.xlabel('Data Points (sorted by distance)')
plt.ylabel(f'{k}-distance')
plt.title(f'{k}-Distance Graph')
plt.ylim(0,3)
plt.show()
In [37]:
from sklearn.cluster import DBSCAN
clustering = DBSCAN(eps=4, min_samples=500).fit(df_copy1)
In [42]:
draw_tsne(df_copy1 , c=clustering.labels_)
In [243]:
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=4, random_state=42)
gmm.fit(df_copy1)

labels = gmm.predict(df_copy1)
In [245]:
df_gaussian =df_copy1.copy()
In [246]:
df_gaussian['expectation clusters'] = labels
In [247]:
draw_tsne(df_copy1 , c=labels)

Isolation Forest

In [257]:
from sklearn.ensemble import IsolationForest
In [261]:
#Use isolation forest for anomaly detection


isolationforest = IsolationForest()
isolationforest.fit(df_copy1)
outlier_preds = isolationforest.predict(df_copy1)
anomalies = df_copy1[outlier_preds == -1]

kmeans = KMeans(n_clusters=5)
kmeans.fit(df_copy1)
labels = kmeans.labels_

draw_tsne(df_copy1, labels)

6. Training and hyperparamter tuning¶

(Go to top)

Before we start the training process we need to specify 3 paramters:
1- Linkage criteria : The linkage criterion determines the distance between two clusters

- Complete-Linkage Clustering
- Single-Linkage Clustering
- Average-Linkage Clustering
- Centroid Linkage Clustering

2- Distance function:

- Euclidean Distance
- Manhattan Distance
- Mahalanobis distance

3- Number of clusters

Number of clusters¶

Use Dendograms to specify the optimum number of clusters

  • Compare how changing linkage criteria or distance function would affect the optimum number of clusters
  • you can use silhouette_score or any other evalution method to help you determine the optimum number of clusters https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
In [ ]:
import scipy.cluster.hierarchy as shc

plt.figure(figsize=(10, 7))
plt.title("Counters Dendograms")
dend = shc.dendrogram(shc.linkage(y=... , method=...,metric=...),orientation='right') #fill y with your dataframe
                                                                                      #and method with linkage criteria
                                                                                      #and metric with distance function
In [42]:
import scipy.cluster.hierarchy as shc

plt.figure(figsize=(20, 20))
plt.title("Counters Dendograms")
dend = shc.dendrogram(shc.linkage(y=df_copy1 , method='complete',metric='euclidean'))
In [44]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
tsne = TSNE(n_components=2)
tsne_data = tsne.fit_transform(df_copy1)

from sklearn.cluster import AgglomerativeClustering
n_clusters = np.arange(3,6)
inertia = []
for i in n_clusters:
    agg = AgglomerativeClustering(n_clusters=i, linkage='complete', affinity='euclidean').fit(df_copy1)
    print("seliouette score for linkage = {} and distance = {} and n_clusters = {} is {}".format('complete', 'euclidean', i, silhouette_score(df_copy1, agg.labels_)))
    sns.scatterplot(x=tsne_data[:,:1].reshape(-1),y = tsne_data[:,1:].reshape(-1),hue=agg.labels_,palette='viridis')
    plt.title('linkage = {} and distance = {}'.format('complete', 'euclidean'))
    silhouette_score(df_copy1, agg.fit_predict(df_copy1))
    plt.show()
seliouette score for linkage = complete and distance = euclidean and n_clusters = 3 is 0.613880263469066
seliouette score for linkage = complete and distance = euclidean and n_clusters = 4 is 0.6545578234374232
seliouette score for linkage = complete and distance = euclidean and n_clusters = 5 is 0.6623238130728172
In [242]:
tsne = TSNE(n_components=2)
tsne_data = tsne.fit_transform(df_copy1)

from sklearn.cluster import AgglomerativeClustering
n_clusters = np.arange(3,6)
inertia = []
for i in n_clusters:
    agg = AgglomerativeClustering(n_clusters=i, linkage='average', affinity='manhattan').fit(df_copy1)
    print("seliouette score for linkage = {} and distance = {} and n_clusters = {} is {}".format('average', 'manhattan', i, silhouette_score(df_copy1, agg.labels_)))
    sns.scatterplot(x=tsne_data[:,:1].reshape(-1),y = tsne_data[:,1:].reshape(-1),hue=agg.labels_,palette='viridis')
    plt.title('linkage = {} and distance = {}'.format('average', 'manhattan'))
    silhouette_score(df_copy1, agg.fit_predict(df_copy1))
    plt.show()
seliouette score for linkage = average and distance = manhattan and n_clusters = 3 is 0.6137830198848035
seliouette score for linkage = average and distance = manhattan and n_clusters = 4 is 0.5753685883988677
seliouette score for linkage = average and distance = manhattan and n_clusters = 5 is 0.56803031747135
In [ ]:
#training
from sklearn.cluster import AgglomerativeClustering
  • Try to use PCA to reduce the number of features and compare how this will affect the clustring process
  • Try to run your code again but with different tranformation technique
  • Implement gap statistics method and use it as evaluation metric and compare the result with what you did before https://www.datanovia.com/en/lessons/determining-the-optimal-number-of-clusters-3-must-know-methods/#gap-statistic-method